Data Preprocessing#

Student namen: Laiba Shamsul, Popke Snoek, Yoshi Fu, Pepeyn Velthuijse

Team nummer: G4

The following code is used to preprocess the datasets into files that are used for the data story. Note that the code is not actually executed as it would raise an exception due to the missing dataset csv file.

Note that it requires the dataset to be downloaded and inserted into the same folder as this notebook. You may find visualizations that did not make it into the actual data story. The dataset is too big to put on git, git lfs or for pandas.read_csv() to read using a share link on google drive/onedrive.

The entire cleaned dataset can be downloaded from: https://drive.google.com/file/d/1sTGPzVfk017Y8n2KPgsIxP9eXdl02YgH/view

If you want to see the code that made the cleaned dataset, then look at the ‘clean.py’ file in the github repository. FuYoshi/data_story_project

# Import packages
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
# Read the big dataset in chunks.
chunk_size = 1000000
dtypes = {"Country": str, "Mk": str, "Cn": str, "m (kg)": float, "Enedc (g/km)": float, "Ewltp (g/km)": float, "W (mm)": float, "Ft": str, "Ernedc (g/km)": float, "Erwltp (g/km)": float, "year": int}
chunk_container = pd.read_csv("CO2_data.csv", dtype=dtypes, chunksize=chunk_size)

# Split the dataset by year. 
# This ensures that the kernel does not die due to memory shortage and some plots only require data of 2021.
co2_2015 = pd.DataFrame()
co2_2016 = pd.DataFrame()
co2_2017 = pd.DataFrame()
co2_2018 = pd.DataFrame()
co2_2019 = pd.DataFrame()
co2_2020 = pd.DataFrame()
co2_2021 = pd.DataFrame()
for chunk in chunk_container:
    co2_2015 = pd.concat([co2_2015, chunk[chunk["year"] == 2015]], ignore_index=True)
    co2_2016 = pd.concat([co2_2016, chunk[chunk["year"] == 2016]], ignore_index=True)
    co2_2017 = pd.concat([co2_2017, chunk[chunk["year"] == 2017]], ignore_index=True)
    co2_2018 = pd.concat([co2_2018, chunk[chunk["year"] == 2018]], ignore_index=True)
    co2_2019 = pd.concat([co2_2019, chunk[chunk["year"] == 2019]], ignore_index=True)
    co2_2020 = pd.concat([co2_2020, chunk[chunk["year"] == 2020]], ignore_index=True)
    co2_2021 = pd.concat([co2_2021, chunk[chunk["year"] == 2021]], ignore_index=True)
co2_2021.head()
Cn Country Enedc (g/km) Ernedc (g/km) Erwltp (g/km) Ewltp (g/km) Ft Mk W (mm) m (kg) year
0 MITSUBISHI OUTLANDER IS 40.0 NaN NaN 46.0 petrol-electric Mitsubishi 2670.0 1965.0 2021
1 MITSUBISHI OUTLANDER IS 40.0 NaN NaN 46.0 petrol-electric Mitsubishi 2670.0 1965.0 2021
2 MITSUBISHI OUTLANDER IS 40.0 NaN NaN 46.0 petrol-electric Mitsubishi 2670.0 1965.0 2021
3 MITSUBISHI OUTLANDER IS 40.0 NaN NaN 46.0 petrol-electric Mitsubishi 2670.0 1965.0 2021
4 MITSUBISHI OUTLANDER IS 40.0 NaN NaN 46.0 petrol-electric Mitsubishi 2670.0 1965.0 2021
# Set to true to convert data to csv. False otherwise.
to_csv = False
# Compute data for bar graph with average CO2 emission per country
country_emission = pd.DataFrame()
for df in [co2_2015, co2_2016, co2_2017, co2_2018, co2_2019, co2_2020, co2_2021]:
    # Get subset of columns.
    df = df[["Country", "year", "Ewltp (g/km)", "Erwltp (g/km)"]]
    # Group by country and year and compute mean and std.
    df = df.groupby(["Country", "year"]).agg({"Ewltp (g/km)": ['mean', 'sum'], "Erwltp (g/km)": 'mean'}).reset_index()
    df.columns = df.columns.map(' '.join).str.strip()
    country_emission = pd.concat([country_emission, df], ignore_index=True)

if to_csv:
    country_emission.to_csv("country_emission.csv")

print(country_emission)
    Country  year  Ewltp (g/km) mean  Ewltp (g/km) sum  Erwltp (g/km) mean
0        AT  2015         167.352565         9610053.7            1.740408
1        BE  2015         168.882209         4642909.7            2.788953
2        BG  2015         179.490853          665193.1            3.076667
3        CY  2015         173.255686          251394.0                 NaN
4        CZ  2015         182.923737         2269534.8            1.707770
..      ...   ...                ...               ...                 ...
195      PT  2021         107.297623        15676290.0            1.515715
196      RO  2021         126.562358        15296959.4            1.372131
197      SE  2021          90.271425        26249034.6            1.361123
198      SI  2021         133.912345         6943489.0            1.573788
199      SK  2021         139.223296        10530989.3            1.396190

[200 rows x 5 columns]
# Plot settings
rows = 2
cols = 1
fig = make_subplots(rows=rows, cols=cols, x_title="Landcode", y_title="CO2 emissie in WLTP (g/km)")

for year in [2015, 2016, 2017, 2018, 2019, 2020, 2021]:
    # Find index of 'IE' to split the countries on ('IE' is often in the middle).
    country_emission_year = country_emission[country_emission["year"]==year].reset_index()
    split_index = country_emission_year[country_emission_year["Country"]=="IE"].index
    country_emission_split = np.split(country_emission[country_emission["year"]==year], split_index)

    # Subplot settings
    showlegend = True
    for row in range(rows):
        for col in range(cols):
            index = cols * row + col
            fig.append_trace(
                go.Bar(
                    x=country_emission_split[index]["Country"],
                    y=country_emission_split[index]["Ewltp (g/km) mean"],
                    name=year,
                    marker=dict(color=px.colors.qualitative.Plotly[year-2015]),
                    showlegend=showlegend,
                ),
                row=row + 1,
                col=col + 1,
            )
            showlegend = False

fig.update_layout(
    title_text="Visualisatie 1: CO2-uitstoot door personenauto's per land in de EU tussen 2015 en 2021",
    legend_title="Jaar",
)
fig.show()
# Plot settings
rows = 2
cols = 1
fig = make_subplots(rows=rows, cols=cols, x_title="Landcode", y_title="CO2 emissie reductie in WLTP (g/km)")

for year in [2015, 2016, 2017, 2018, 2019, 2020, 2021]:
    # Find index of 'IE' to split the countries on ('IE' is often in the middle).
    country_emission_year = country_emission[country_emission["year"]==year].reset_index()
    split_index = country_emission_year[country_emission_year["Country"]=="IE"].index
    country_emission_split = np.split(country_emission[country_emission["year"]==year], split_index)

    # Subplot settings
    for row in range(rows):
        for col in range(cols):
            index = cols * row + col
            showlegend = row == col == 0
            fig.append_trace(
                go.Bar(
                    x=country_emission_split[index]["Country"],
                    y=country_emission_split[index]["Erwltp (g/km) mean"],
                    name=year,
                    marker=dict(color=px.colors.qualitative.Plotly[year-2015]),
                    showlegend=showlegend,
                ),
                row=row + 1,
                col=col + 1,
            )

fig.update_layout(
    title_text="Visualisatie 2: Gemiddelde CO2-uitstoot vermindering vanwege innovatieve technologieën per land in de EU tussen 2019 en 2021",
    legend_title="Jaar",
)
fig.show()
# Compute pie chart with average CO2 emission per fuel type.
ft_mean_emission = co2_2021.groupby(co2_2021["Ft"]).agg({"Ewltp (g/km)": 'mean'})
# ft_mean_emission = co2_2021.groupby(co2_2021["Ft"])["Ewltp (g/km)"].mean()
ft_mean_emission = ft_mean_emission.sort_values(by="Ewltp (g/km)", ascending=False)

if to_csv:
    ft_mean_emission.to_csv("ft_mean_emission.csv")

print(ft_mean_emission)
                 Ewltp (g/km)
Ft                           
e85                159.597920
diesel             144.489682
petrol             134.432168
other              123.094150
lpg                121.112130
ng-biomethane      113.262629
ng                 104.922170
petrol-electric     41.888312
diesel-electric     39.109638
electric             0.000000
# Plot pie chart with average CO2 emission per fuel type.
fig = px.pie(ft_mean_emission,
    names=ft_mean_emission.index,
    values="Ewltp (g/km)",
    color=ft_mean_emission.index,
    title="CO2 emission by fuel type in 2021",
    hole=0.8,
    labels={
        "Ewltp (g/km)": "CO2 emission WLTP (g/km)"
    },
)
fig.update_layout(showlegend=False)
fig.update_traces(textposition='outside', textinfo="label + percent")
fig.update_layout(
    updatemenus=[
        dict(
            type = "buttons",
            direction = "left",
            buttons=list([
                dict(
                    args=["type", "pie"],
                    label="Pie",
                    method="restyle"
                ),
                dict(
                    args=["type", "bar"],
                    label="Bar",
                    method="restyle"
                )
            ]),
            showactive=True,
            x=0.11,
            xanchor="left",
            y=1.15,
            yanchor="top"
        ),
    ]
)

# Add annotation
fig.update_layout(
    annotations=[
        dict(text="Trace type:", showarrow=False, x=0, y=1.12, yref="paper", align="left")
    ]
)
fig.show()
fig = make_subplots(specs=[[{"secondary_y": True}]])

years = [2015, 2016, 2017, 2018, 2019, 2020, 2021]
eu_emission = country_emission.groupby("year").sum()

fig.add_trace(
    go.Scatter(x=years, y=eu_emission["Ewltp (g/km) mean"], name="Gemiddelde"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=years, y=eu_emission["Ewltp (g/km) sum"], name="Totaal"),
    secondary_y=True,
)

fig.update_layout(
    title_text="CO2 emissie in de EU",
    xaxis_title="Jaar",
)

fig.update_yaxes(title_text="Gemiddelde CO2 emissie in WLTP (g/km)", secondary_y=False)
fig.update_yaxes(title_text="Totale CO2 emissie in WLTP (g/km)", secondary_y=True)

fig.show()
# Lijst om de resultaten in op te slaan
counts_results = []

def classify_electric(x: str):
    if "-electric" in x:
        return "Semi-Elektrisch"
    elif "electric" in x:
        return "Elektrisch"
    else:
        return "Anders"

# Itereer over elke chunk
for chunk in pd.read_csv("CO2_data.csv", chunksize=1000000):
    # Classificeer elektrische en hybride auto's als 'Elektrisch', en alle anderen als 'Anders'
    chunk['Ft'] = chunk['Ft'].apply(classify_electric)
    # chunk['Ft'] = chunk['Ft'].apply(lambda x: 'Elektrisch' if 'electric' in x or 'hybrid' in x else 'Anders')

    # Doe de berekeningen
    count_result = chunk.groupby(['year', 'Ft'], group_keys=False).size().reset_index(name='counts')
    # Voeg het resultaat toe aan de lijst
    counts_results.append(count_result)
# Concateneer alle resultaten in een dataframe
df_counts = pd.concat(counts_results)

# Splits bij jaar en brandstoftype
df_counts = df_counts.groupby(["year", "Ft"]).agg({"counts": 'sum'}).reset_index()

# Bereken totale emissie per jaar.
df_counts["sum"] = df_counts.groupby("year")["counts"].transform('sum')

# Bereken de percentages
df_counts["percent"] = 100 * df_counts["counts"] / df_counts["sum"]

if to_csv:
    df_counts.to_csv("fuel_type_distribution.csv")

print(df_counts)
    year               Ft    counts       sum    percent
0   2015           Anders    415321    419369  99.034740
1   2015       Elektrisch      1899    419369   0.452823
2   2015  Semi-Elektrisch      2149    419369   0.512437
3   2016           Anders    473519    478728  98.911908
4   2016       Elektrisch      2452    478728   0.512191
5   2016  Semi-Elektrisch      2757    478728   0.575901
6   2017           Anders   4859439   4935537  98.458162
7   2017       Elektrisch     43266   4935537   0.876622
8   2017  Semi-Elektrisch     32832   4935537   0.665216
9   2018           Anders  14598778  14898362  97.989148
10  2018       Elektrisch    148184  14898362   0.994633
11  2018  Semi-Elektrisch    151400  14898362   1.016219
12  2019           Anders   7975216   9793017  81.437784
13  2019       Elektrisch    980563   9793017  10.012880
14  2019  Semi-Elektrisch    837238   9793017   8.549337
15  2020           Anders   7975216   9793017  81.437784
16  2020       Elektrisch    980563   9793017  10.012880
17  2020  Semi-Elektrisch    837238   9793017   8.549337
18  2021           Anders   7975216   9793017  81.437784
19  2021       Elektrisch    980563   9793017  10.012880
20  2021  Semi-Elektrisch    837238   9793017   8.549337
# Definieer kleuren voor de brandstof types
colormap = {
    "Elektrisch": 'blue',
    "Semi-Elektrisch": 'purple',
    "Anders": 'grey',
}

# Plot settings
fig = px.bar(df_counts,
    x="year",
    y="percent",
    color="Ft",
    title="Brandstof type distributie van personenauto's in de EU tussen 2015 en 2021",
    labels={
        "percent": "Percentage van alle personenauto's",
        "year": 'Jaar',
        "Ft": 'Brandstof type',
    },
    hover_data=['counts'],
    color_discrete_map=colormap  # Gebruik de kleurenkaart
)
fig.show()
total_emission_per_ft = co2_2021.groupby('Ft').agg({"Ewltp (g/km)": ['sum', 'size']})
total_emission_per_ft.columns = total_emission_per_ft.columns.map(' '.join).str.strip()
total_emission_per_ft = total_emission_per_ft.rename(columns={
    "Ewltp (g/km) sum": "Ewltp (g/km)",
    "Ewltp (g/km) size": "n",
})
total_emission_per_ft = total_emission_per_ft.sort_values(by="Ewltp (g/km)", ascending=False).reset_index()
if to_csv:
    total_emission_per_ft.to_csv("total_emission_per_ft.csv")

print(total_emission_per_ft)

# Plot settings
fig = px.bar(total_emission_per_ft,
    x="Ft",
    y="Ewltp (g/km)",
    title="Totale CO2-uitstoot van personenauto's in de EU per brandstof type (2021)",
    custom_data=[total_emission_per_ft["n"]],
    labels={
        "Ewltp (g/km)": "CO2 emissie in WLTP (g/km)",
        "Ft": "Brandstof type",
    }
)
fig.update_traces(hovertemplate="CO2 emissie in WLTP (g/km)=%{y}<br>Aantal auto's=%{customdata[0]}")
fig.show()
                Ft  Ewltp (g/km)        n
0           petrol   735257118.5  5469354
1           diesel   322991801.5  2235397
2  petrol-electric    32944236.2   786478
3              lpg    26007982.1   214743
4               ng     4101197.8    39088
5  diesel-electric     1985205.2    50760
6              e85     1150701.0     7210
7    ng-biomethane      864080.6     7629
8            other      220954.0     1795
9         electric           0.0   980563
# Compute data for emission per brand per year.
make_emission = pd.DataFrame()
for df in [co2_2015, co2_2016, co2_2017, co2_2018, co2_2019, co2_2020, co2_2021]:
    # Get subset of columns.
    df = df[["year", "Mk", "Ft", "Ewltp (g/km)"]]
    # Group by year and brand and compute mean.
    df = df.groupby(["year", "Mk"]).agg({"Ewltp (g/km)": 'mean', "Ft": 'first'}).reset_index()
    make_emission = pd.concat([make_emission, df], ignore_index=True)
print(make_emission)
     year            Mk  Ewltp (g/km)               Ft
0    2015    Alfa Romeo    158.938858           diesel
1    2015        Alpina    233.141509           petrol
2    2015  Aston Martin    410.137643           petrol
3    2015          Audi    177.115525           petrol
4    2015           BMW    179.843678           diesel
..    ...           ...           ...              ...
490  2021           UAZ    276.000000           petrol
491  2021    Volkswagen    118.445517           diesel
492  2021         Volvo    110.584278  petrol-electric
493  2021            ZD      0.000000         electric
494  2021         Škoda    126.702169           petrol

[495 rows x 4 columns]
car_emission_mass = co2_2021.sample(n=100000, replace=False)
car_emission_mass = car_emission_mass[["Ft", "Ewltp (g/km)", "m (kg)"]]
if to_csv:
    car_emission_mass.to_csv("car_emission_mass.csv")

# Plot settings
fig = px.scatter(car_emission_mass,
	x="m (kg)",
	y="Ewltp (g/km)",
	color="Ft",
)

# Create toggle visibility button for every fuel type.
buttons = []
for i, val in enumerate(car_emission_mass["Ft"].unique()):
    button = dict(method='restyle',
        label=val,
        args=[{'visible': True}, [i]],
        args2=[{'visible': "legendonly"}, [i]],
    )
    buttons.append(button)

# Menu settings
fig.update_layout(
    updatemenus=[
        dict(
            buttons=buttons,
            visible=False,
        ),
    ],
    annotations=[
        dict(text="Toggle traces by clicking the legend", x=0, xref="paper", y=1.05, yref="paper", align="right", showarrow=False),
    ],
    title_text="A really interesting title",
    legend_title="Brandstof type",
    xaxis_title="Gewicht (kg)",
    xaxis_range=(0, 3500),
    yaxis_title="CO2 emissie in WLTP (g/km)",
    yaxis_range=(0, 550),
)
fig.show()
car_emission = pd.read_csv("car_emission.csv")
df_split = np.array_split(car_emission, 8)

# Split data into different parts.
plot1 = px.scatter(df_split[0], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['indianred', 'limegreen', 'aqua'], hover_data=[df_split[0]['Ft']])
plot2 = px.scatter(df_split[1], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['darkmagenta', 'darkolivegreen', 'darkorange'], hover_data=[df_split[1]['Ft']])
plot3 = px.scatter(df_split[2], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['deeppink', 'deepskyblue', 'dimgray'], hover_data=[df_split[2]['Ft']])
plot4 = px.scatter(df_split[3], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['forestgreen', 'fuchsia', 'gainsboro'], hover_data=[df_split[3]['Ft']])
plot5 = px.scatter(df_split[4], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['lavenderblush', 'lawngreen', 'lemonchiffon'], hover_data=[df_split[4]['Ft']])
plot6 = px.scatter(df_split[5], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['magenta', 'maroon', 'mediumaquamarine'], hover_data=[df_split[5]['Ft']])
plot7 = px.scatter(df_split[6], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['midnightblue', 'goldenrod', 'mediumseagreen'], hover_data=[df_split[6]['Ft']])
plot8 = px.scatter(df_split[7], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['orangered', 'orchid', 'palegoldenrod'], hover_data=[df_split[7]['Ft']])

fig = make_subplots(rows=3, cols=3, shared_yaxes=True, x_title="Jaar", y_title="CO2 emission in NEDC (g/km)")

# Add the trace of each part to the plot.
fig.add_trace(plot1['data'][0], row=1, col=1)
fig.add_trace(plot1['data'][1], row=1, col=1)
fig.add_trace(plot1['data'][2], row=1, col=1)

fig.append_trace(plot2['data'][0], row=1, col=2)
fig.append_trace(plot2['data'][1], row=1, col=2)
fig.append_trace(plot2['data'][2], row=1, col=2)

fig.append_trace(plot3['data'][0], row=1, col=3)
fig.append_trace(plot3['data'][1], row=1, col=3)
fig.append_trace(plot3['data'][2], row=1, col=3)

fig.append_trace(plot4['data'][0], row=2, col=1)
fig.append_trace(plot4['data'][1], row=2, col=1)
fig.append_trace(plot4['data'][2], row=2, col=1)

fig.append_trace(plot5['data'][0], row=2, col=2)
fig.append_trace(plot5['data'][1], row=2, col=2)
fig.append_trace(plot5['data'][2], row=2, col=2)

fig.append_trace(plot6['data'][0], row=2, col=3)
fig.append_trace(plot6['data'][1], row=2, col=3)
fig.append_trace(plot6['data'][2], row=2, col=3)

fig.append_trace(plot7['data'][0], row=3, col=1)
fig.append_trace(plot7['data'][1], row=3, col=1)
fig.append_trace(plot7['data'][2], row=3, col=1)

fig.append_trace(plot8['data'][0], row=3, col=2)
fig.append_trace(plot8['data'][1], row=3, col=2)
fig.append_trace(plot8['data'][2], row=3, col=2)

# Plot settings
fig.update_layout(
    showlegend=True,
    height=800,
    width=800,
    title_text="Gemiddelde CO2-uitstoot van personenauto's per merk tussen 2015 en 2021",
    legend_title="Automerk",
)
fig.update_xaxes(tickangle=-45, categoryarray=[2015, 2016, 2017, 2018, 2019, 2020, 2021], type='category')
fig.show()